This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: ###load packages

install.packages(“ggplot2”) install.packages(“data.table”) install.packages(“ggrepel”)

library(ggplot2)
library(reshape2)
library(ggrepel)
#require(data.table)

setup working directory

datadir <- "~/Desktop/Lab\ research/Barseq_SGA/N-lim-4mutants/seqdata/"
dir(datadir)
##  [1] "barseq_Nlim_cache"                              
##  [2] "barseq_Nlim.Rmd"                                
##  [3] "filterData (0MM) without zero counts sample.csv"
##  [4] "GeneInter_N_lim.csv"                            
##  [5] "n.barnone.0MM.counts.csv"                       
##  [6] "n.barnone.0MM.counts.txt"                       
##  [7] "n.barnone.1MM.counts.csv"                       
##  [8] "n.barnone.1MM.counts.txt"                       
##  [9] "n.barnone.2MM.counts.csv"                       
## [10] "n.barnone.2MM.counts.txt"                       
## [11] "ScreenCounts201605050MM.csv"                    
## [12] "ScreenCounts201605050MMMelted.csv"              
## [13] "ScreenCounts201605051MM.csv"                    
## [14] "ScreenCounts201605051MMMelted.csv"              
## [15] "ScreenCounts201605052MM.csv"                    
## [16] "ScreenCounts201605052MMMelted.csv"              
## [17] "strain counts distribution"

read in index file with pre-known information (self made-varied depends on experiment setup)

index <- read.csv(paste0(datadir, "GeneInter_N_lim.csv"))
rownames(index)<-index$SampleIndex
head(index)
##          Index LibraryName SampleIndex StarvingMedia Replicate
## Sample1      1          HO     Sample1         N-lim         1
## Sample7      2          HO     Sample7         N-lim         1
## Sample13     3          HO    Sample13         N-lim         1
## Sample19     4          HO    Sample19         N-lim         1
## Sample25     5          HO    Sample25         N-lim         1
## Sample31     6          HO    Sample31         N-lim         1
##          SamplingTime PrimerIndex
## Sample1            0h           1
## Sample7            9h           7
## Sample13          14h          13
## Sample19          18h          19
## Sample25          24h          25
## Sample31          32h          31

Counts tables, melted

Here we make a list of data.frames, where each data.frame is just the counts table of a different Barnone run, with each a different mismatch allowed.

allFileNames <-  dir(datadir)[grepl("counts.*txt", dir(datadir))]
rawCountsList <- list()
for (fz in allFileNames) {
  numberMismatches <- sub(".*(\\dMM).*","\\1",fz)
  rawCountsList[[numberMismatches]] <- read.table(paste0(datadir,fz),header=T)
}
rawCountsList[[1]][1:5,1:5]
##    Strain Sample9_UP Sample9_DOWN Sample111_UP Sample111_DOWN
## 1 YAL008W          0            0            0              0
## 2 YBR255W        472            0           71              0
## 3 YGR164W          0           66            0            546
## 4 YGR131W          0            0            0              0
## 5 YNL003C          1            0          348              0
#Below, we melt the data into a tidy long data.frame:
datar <- list()
for (MM in names(rawCountsList)) {
  mcounts <- melt(rawCountsList[[MM]], id.vars="Strain", value.name="Counts")
  mcounts[,ncol(mcounts)+1:2] <- colsplit(mcounts$variable,"_", c("SampleNum","Tag"))
 #aggregate counts in each sample collection -- from same set of genomic DNA
  sampleTotalCounts <- aggregate(Counts~variable,data=mcounts,sum)
  rownames(sampleTotalCounts) <- sampleTotalCounts$variable
 #generate the total counts for each sample collection
  mcounts$TotalCounts <- sampleTotalCounts[mcounts$variable,"Counts"]
 #ratio of certain strain counts over total counts in each sample
  mcounts$RelativeCounts <- mcounts$Counts / mcounts$TotalCounts
  mcounts<-subset(mcounts, mcounts$SampleNum %in% index$SampleIndex)
  mcounts[,ncol(mcounts)+1:7] <- index[mcounts$SampleNum,
    c("Index","LibraryName","SampleIndex","StarvingMedia","Replicate","SamplingTime","PrimerIndex")]
  datar[[MM]] <- mcounts[,c("Strain","SampleNum","Tag",
      "Index","LibraryName","SampleIndex","StarvingMedia","Replicate","SamplingTime","PrimerIndex",
      "Counts","RelativeCounts")]
}

#A sample of what that looks like:
datar[[1]][1:5,1:10]
##    Strain SampleNum Tag Index LibraryName SampleIndex StarvingMedia
## 1 YAL008W   Sample9  UP    22          HO     Sample9         N-lim
## 2 YBR255W   Sample9  UP    22          HO     Sample9         N-lim
## 3 YGR164W   Sample9  UP    22          HO     Sample9         N-lim
## 4 YGR131W   Sample9  UP    22          HO     Sample9         N-lim
## 5 YNL003C   Sample9  UP    22          HO     Sample9         N-lim
##   Replicate SamplingTime PrimerIndex
## 1         3           9h           9
## 2         3           9h           9
## 3         3           9h           9
## 4         3           9h           9
## 5         3           9h           9
#select my data from the whole sequencing library mixture

mydatar<-list()
for (MM in names(datar)) {
  mydatar[[MM]] <- subset(datar[[MM]],datar[[MM]]$SampleNum%in%
  index$SampleIndex)
}
mydatar[[1]][1:5,1:10]
##    Strain SampleNum Tag Index LibraryName SampleIndex StarvingMedia
## 1 YAL008W   Sample9  UP    22          HO     Sample9         N-lim
## 2 YBR255W   Sample9  UP    22          HO     Sample9         N-lim
## 3 YGR164W   Sample9  UP    22          HO     Sample9         N-lim
## 4 YGR131W   Sample9  UP    22          HO     Sample9         N-lim
## 5 YNL003C   Sample9  UP    22          HO     Sample9         N-lim
##   Replicate SamplingTime PrimerIndex
## 1         3           9h           9
## 2         3           9h           9
## 3         3           9h           9
## 4         3           9h           9
## 5         3           9h           9

Prelim QC analysis - which libraries are good?

Mismatch parameter?

#So which mismatch tolerance to use? The assumption is that some barcodes ain’t perfect, and that Barnone will find the right one within a mismatch parameter.
#Which one to use?
#Presumably there’s only real barcodes in the library, so I should increase the parameter to the point where I get more reads per strain, but not at the point where strains start to canabalize counts from other strains.
summedCounts <- list()
for (MM in names(mydatar)) {
  summedCounts[[MM]] <- aggregate(Counts~Strain,FUN=sum,
      data=subset(mydatar[[MM]], Replicate!=""))
}
allSummedCounts <- data.table::rbindlist(summedCounts,idcol=T)
dallSummedCounts <- dcast(data=allSummedCounts,Strain~.id,value.var="Counts")

g <- ggplot(dallSummedCounts)+
  scale_y_log10()+scale_x_log10()+theme_bw()+
  geom_point(size=0.1)
g+aes(x=`0MM`,y=`1MM`)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis

g+aes(x=`1MM`,y=`2MM`)
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous x-axis

pdf("strain counts distribution",width=8,height=6)
dallSummedCounts_m<-melt(dallSummedCounts, id.var="Strain")
ggplot(dallSummedCounts_m,aes(x=value))+
      theme_bw()+scale_x_log10()+
      ylab("Strain counts distribution")+
      geom_density(data=subset(dallSummedCounts_m,variable=="0MM"), aes(colour="0MM"))+
      geom_density(data=subset(dallSummedCounts_m,variable=="1MM"), aes(colour="1MM"))+
      geom_density(data=subset(dallSummedCounts_m,variable=="2MM"), aes(colour="2MM"))
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1996 rows containing non-finite values (stat_density).
## Warning: Removed 1866 rows containing non-finite values (stat_density).
## Warning: Removed 1650 rows containing non-finite values (stat_density).
dev.off()
## quartz_off_screen 
##                 2
g <- ggplot(dallSummedCounts)+theme_bw()+
  scale_x_log10()+geom_histogram(bins=50)
g+aes(x=`0MM`)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1996 rows containing non-finite values (stat_bin).

g+aes(x=`1MM`)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1866 rows containing non-finite values (stat_bin).

g+aes(x=`2MM`)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1650 rows containing non-finite values (stat_bin).

#(interpretation)

#And what about the mismatches of sample indicies?

summedBySample <- list()
for (MM in names(mydatar)) {
  summedBySample[[MM]] <- aggregate(Counts~SampleNum+Tag,FUN=sum,
      data=mydatar[[MM]])
}

#
allSummedBySample <- data.table::rbindlist(summedBySample,idcol=T)
allSummedBySample$Used <- ifelse(allSummedBySample$SampleNum%in%index$SampleIndex,"Yep","Nope")
#mySummedSample<- subset(allSummedBySample,allSummedBySample$SampleNum%in%
 # paste("Sample",rownames(index),sep=""))
##this step seems not necessary for the samples mixed from different experiment
#only plot my sample
allSummedBySampleUP<-subset(allSummedBySample, allSummedBySample$Tag == "UP")
ggplot(allSummedBySampleUP)+theme_bw()+
  aes(x=SampleNum,y=log10(Counts),col=Used)+facet_grid(.id~Tag)+
  geom_point()+#scale_y_log10()+
  theme(axis.text.x=element_text(angle=90,size=5,face="bold"),legend.position="top")

allSummedBySampleDOWN<-subset(allSummedBySample, allSummedBySample$Tag == "DOWN")
ggplot(allSummedBySampleDOWN)+theme_bw()+
  aes(x=SampleNum,y=log10(Counts),col=Used)+facet_grid(.id~Tag)+
  geom_point()+#scale_y_log10()+
  theme(axis.text.x=element_text(angle=90,size=5,face="bold"),legend.position="top")

#saving files
for (MM in names(mydatar)) {
  write.csv(file=paste0("N_lim4",MM,"Melted.csv"),
    x=subset(mydatar[[MM]],Replicate!=""))
}
for (MM in names(mydatar)) {
  write.csv(file=paste0("N_lim4",MM,".csv"),
    x=dcast(subset(mydatar[[MM]],Replicate!=""),#,"Starvation","SamplingIndex","Replicate","SamplingTime","BarTag"
      Strain+Tag~SampleNum+LibraryName+StarvingMedia+SampleIndex+Replicate+SamplingTime,
      value.var="Counts"))
}
useMM <- "0MM"
#sdat <- subset(mydatar[[useMM]], Replicate!="")
sdat <- subset(mydatar[[useMM]], SampleNum %in% index$SampleIndex)
sdat[,-c((-1:0)+ncol(sdat))] <- lapply(sdat[,-c((-1:0)+ncol(sdat))],
  factor)

ggplot(sdat)+
  aes(x=SampleNum,
    col=LibraryName:Replicate:SamplingTime,weight=Counts)+
  facet_grid(Tag~.)+geom_point(stat="count")+
  theme(axis.text.x=element_text(angle=90))+
  scale_y_log10()

ggplot(sdat)+
  aes(x=SampleNum:Tag,y=Counts)+
  geom_boxplot()+scale_y_log10()+
  facet_grid(LibraryName~SamplingTime+Replicate,
    scales="free_x",space="free")+
  theme(axis.text.x=element_text(angle=90))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 800752 rows containing non-finite values (stat_boxplot).

#Indexing numeric data: values will be adjusted so they are equal to each other in a given starting time period.
nozerosdat<-subset(sdat,sdat$Counts>0)
##for (i in unique(nozerosdat$Strain)){
  #    s<-subset(nozerosdat, nozerosdat$Strain==i)
  #    apply(s, 1, function(x){return x})}

add up samples UP and DOWN tag

zeroMM<-subset(allSummedBySample, allSummedBySample$.id=="0MM")
zeroMM_sum<-aggregate(zeroMM$Counts ~ zeroMM$SampleNum, FUN = sum)

PCA analysis

Sample_0MM<-allSummedBySample[allSummedBySample$Counts>5e4 & allSummedBySample$.id=="0MM"]
filterData<-subset(sdat, sdat$SampleNum %in% Sample_0MM$SampleNum)
ssdat<-subset(sdat,sdat$Counts >0 & sdat$SampleNum %in% Sample_0MM$SampleNum)
dsdat <- dcast(Strain~SampleNum+Tag+LibraryName+Replicate+SamplingTime+StarvingMedia,
  data=ssdat,value.var="Counts")
dsdat[is.na(dsdat)] <- 0
rownames(dsdat) <- dsdat[,1]
dsdat <- dsdat[,-1]
pc <- prcomp(t(dsdat))
pz <- data.frame(pc$x)
pz[,ncol(pz)+(1:6)] <- colsplit(rownames(pz),"_",
  names=c("SampleNum","Tag","LibraryName","Replicate","SamplingTime","StarvingMedia"))
pz[ncol(pz)-(0:5)] <- lapply(pz[ncol(pz)-(0:5)],factor)

######Plot PCA vs UP and DN tag
g <- ggplot(pz)+geom_point()
g+aes(x=PC1,y=PC2,col=Tag)

g+facet_wrap(~Tag,scales="free")+
  aes(x=PC1,y=PC2,col=LibraryName:SamplingTime:Replicate)+
  theme(legend.position="bottom")

########Plot PCA seperated by tag and library
g <- ggplot(pz)+
  geom_point()+theme(legend.position="bottom")

g+facet_wrap(~Tag,scales="free")+
  aes(x=PC1,y=PC2,col=LibraryName:SamplingTime:Replicate)+
  geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)

g+facet_wrap(~LibraryName+Tag,scales="free")+
  aes(x=PC1,y=PC2,col=LibraryName:SamplingTime:Replicate)+
  geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)

########Plot PCA seperated by sampling time
g <- ggplot(pz)+
  geom_point()+theme(legend.position="bottom")

g+facet_grid(Tag~SamplingTime,scales="free")+
      aes(x=PC1,y=PC2,col=Replicate)+
      geom_text_repel(aes(label=LibraryName),size=3,alpha=0.5,nudge_y=200)

#g+facet_wrap(Tag~LibraryName+SamplingTime,scales="free")+
  

#plot for PC3vsPC4 -- the less representative components for the data set
g+facet_wrap(~Tag,scales="free")+
  aes(x=PC3,y=PC4,col=LibraryName:SamplingTime:Replicate)+
  geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)

g+facet_wrap(~LibraryName+Tag,scales="free")+
  aes(x=PC3,y=PC4,col=LibraryName:SamplingTime:Replicate)+
  geom_text_repel(aes(label=LibraryName),alpha=0.5,nudge_x=200)

#ggplot(mydatar)+theme_bw()+aes(x=(sumz))+
#    geom_histogram(binwidth=50)+facet_grid(time~pulse+replicate)+
#    xlim(0,4e3)